#!/usr/bin/env python

import os
import sys
import hashlib
import logging
import config
import argparse
from tools import parseutility as parser
from tools import file_helper

# Make the directory one level above this file's own directory importable:
# os.path.dirname is applied twice to this file's absolute path and the result
# is appended to sys.path.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Prefer normalize from hmark.parseutility; fall back to the one in
# tools.parseutility when the hmark import raises ImportError.
try:
    from hmark.parseutility import normalize
except ImportError:
    from tools.parseutility import normalize

def remove_vuln_dup():
    """Delete duplicate vulnerable-function samples under the ``vul`` directory.

    Every ``*OLD.vul`` file in each sub-directory of ``<root>/vul`` is read,
    passed through ``normalize`` and hashed with MD5.  For each group of files
    sharing a checksum, the first one encountered is kept and the rest are
    deleted together with their ``_NEW.vul`` and ``.patch`` companions.
    A per-repository summary and a grand total are written to the log.

    Side effects:
        * Changes the process working directory to ``<root>/vul``.
        * Removes files from disk.

    Raises:
        OSError: if the ``vul`` directory cannot be listed/entered, or a
            duplicate ``*OLD.vul`` file cannot be read or removed.
    """
    old_suffix_len = len("_OLD.vul")

    hashdict = {}    # checksum -> [(repo, file name), ...] in discovery order
    cntdict = {}     # repo -> number of duplicates deleted from it
    vulcntlist = []  # number of directory entries per repo, parallel to repolist
    repolist = []

    originalDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # vuddy root directory
    vulsDir = os.path.join(originalDir, "vul")
    dirs = sorted(os.listdir(vulsDir))
    # All paths below are relative to the vul directory.
    os.chdir(vulsDir)

    for d in dirs:
        if not os.path.isdir(d):
            continue
        # Sorted so that the surviving copy of a duplicate does not depend on
        # the order in which the filesystem happens to list the entries.
        entries = sorted(os.listdir(d))
        repolist.append(d)
        cntdict[d] = 0
        vulcntlist.append(len(entries))
        for vul in entries:
            if not vul.endswith("OLD.vul"):
                continue
            with open(os.path.join(d, vul), "r", encoding='utf-8') as fp:
                # NOTE(review): readlines() keeps line endings, so this join
                # doubles every newline. Kept as-is because normalize() is not
                # visible here -- confirm it is whitespace-insensitive before
                # simplifying to fp.read().
                text = '\n'.join(fp.readlines())
            checksum = hashlib.md5(normalize(text).encode('utf-8')).hexdigest()
            # Store (repo, file) as a tuple so names containing spaces survive.
            hashdict.setdefault(checksum, []).append((d, vul))

    cnt = 0
    for group in hashdict.values():
        # The first entry of each group is the copy that is kept.
        for repo, vul in group[1:]:
            cnt += 1
            cntdict[repo] += 1
            base = vul[:-old_suffix_len]
            os.remove(os.path.join(repo, vul))
            # Companions are best-effort and handled independently, so a
            # missing _NEW.vul no longer prevents the .patch from being removed.
            for companion in (base + "_NEW.vul", base + ".patch"):
                companion_path = os.path.join(repo, companion)
                try:
                    os.remove(companion_path)
                except FileNotFoundError:
                    pass
                except OSError as err:
                    logging.warning("Could not remove %s: %s", companion_path, err)

    logging.info("[RESULT]")
    for r, total in zip(repolist, vulcntlist):
        logging.info("\t%s:\tdeleted %s duplicate files from %s files.", r, cntdict[r], total)

    logging.info("Total: %s duplicate files.", cnt)


def remove_abs_dup():
    """Delete duplicate abstracted-function feature files under ``<ROOT_PATH>/abs``.

    Each ``.features`` file returned by the file helpers is deserialized and
    grouped by its ``abs_func_body_hash`` feature.  Within a group the first
    file is kept; every other ``.features`` file and its sibling ``.changes``
    file are removed.  The number of removed feature files per repository
    (the name of the directory containing the file) is logged.

    Raises:
        OSError: if a feature file cannot be read, or a duplicate feature file
            or its ``.changes`` companion cannot be removed.
    """
    feature_suffix = '.features'

    deserializer: parser.Deserializer = parser.FuncFeatures.get_deserializer()
    abs_path = os.path.join(config.ROOT_PATH, 'abs')
    feature_files = file_helper.extract_in_files(file_helper.get_all_file_paths(abs_path), ['.features'])
    logging.info('Loaded %s abs feature files in total.', len(feature_files))

    hash_dict = {}  # abs_func_body_hash -> [feature file path, ...]
    for feature_path in feature_files:
        with open(feature_path, 'r', encoding=config.ENCODING) as f:
            content = f.read().strip()
        func_feature: parser.FuncFeatures = deserializer.deserialize(content)
        hash_val = func_feature.get_feature('abs_func_body_hash')
        hash_dict.setdefault(hash_val, []).append(feature_path)

    # One file per distinct hash is kept, so the surplus is what gets deleted.
    logging.info('Total %s vuln func features and changes files need to be deleted.', (len(feature_files) - len(hash_dict)))

    repo_count = {}
    for file_list in hash_dict.values():
        for feature_file in file_list[1:]:
            repo_name = os.path.basename(os.path.dirname(feature_file))
            # str.rstrip('.features') strips a *set of characters*, not the
            # suffix (e.g. "x_test.features" -> "x_"), so cut the suffix
            # explicitly to get the stem shared with the .changes file.
            if feature_file.endswith(feature_suffix):
                stem = feature_file[:-len(feature_suffix)]
            else:
                # Presumably unreachable given the '.features' filter above;
                # fall back to dropping whatever extension is present.
                stem = os.path.splitext(feature_file)[0]
            change_file = stem + '.changes'
            repo_count[repo_name] = repo_count.get(repo_name, 0) + 1
            os.remove(feature_file)
            os.remove(change_file)

    for repo_name, count in repo_count.items():
        logging.info('Removed %s files in repo %s', count, repo_name)


if __name__ == '__main__':
    # Set up logging before anything is reported.
    config.conf_log()

    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-a', '--remove_abs',
        dest='remove_abs',
        action='store_true',
        help='Specifies if removes just abs files.',
    )
    options = cli.parse_args()

    # With -a only the abs feature files are de-duplicated; otherwise the
    # raw vulnerability samples are.
    dedup = remove_abs_dup if options.remove_abs else remove_vuln_dup
    dedup()